MyData <- read.csv('/Users/jesusleon/Downloads/london_merged.csv', row.names=1)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
time= as_datetime(rownames(MyData[0]))
MyData<-data.frame(time, MyData)
#stationarity
library(tseries)
adf.test(MyData$cnt) # p-value < 0.05 indicates the TS is stationary
## Warning in adf.test(MyData$cnt): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: MyData$cnt
## Dickey-Fuller = -9.9098, Lag order = 25, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(MyData$cnt)
## Warning in kpss.test(MyData$cnt): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: MyData$cnt
## KPSS Level = 2.8862, Truncation lag parameter = 14, p-value = 0.01
So we can conclude that the time series is stationanry
###Regression model with trend, seasonality, and white noise to make future predictions
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('lubridate')
library('forecast')
#creating a time series with xts library and making a zoo object
library('TSstudio')
library('zoo')
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library('xts')
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
#linear regression forecasting
#creating series trend
MyData$trend <- 1:nrow(MyData)
#creating seasonality based on hours
MyData$seasonal <- factor(hour(MyData$time), ordered = FALSE)
h <- length(MyData$time)/8 # setting a testing partition length
train <- MyData[1:(nrow(MyData) - h), ]
test <- MyData[(nrow(MyData) - h + 1):nrow(MyData), ]
#TREND
md_trend <- lm(cnt ~ trend, data = train)
summary(md_trend)
##
## Call:
## lm(formula = cnt ~ trend, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1295.5 -883.6 -297.3 535.6 6767.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.940e+02 1.772e+01 56.09 <2e-16 ***
## trend 2.195e-02 2.014e-03 10.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1094 on 15235 degrees of freedom
## Multiple R-squared: 0.007737, Adjusted R-squared: 0.007672
## F-statistic: 118.8 on 1 and 15235 DF, p-value: < 2.2e-16
#SEASONALITY
md_seasonality <- lm(cnt ~ seasonal, data = train)
summary(md_seasonality)
##
## Call:
## lm(formula = cnt ~ seasonal, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2829.8 -245.4 -21.3 246.6 4969.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 296.63 27.24 10.890 < 2e-16 ***
## seasonal1 -95.06 38.52 -2.468 0.0136 *
## seasonal2 -158.37 38.57 -4.106 4.04e-05 ***
## seasonal3 -200.67 38.57 -5.203 1.99e-07 ***
## seasonal4 -222.29 38.57 -5.764 8.39e-09 ***
## seasonal5 -184.94 38.58 -4.793 1.66e-06 ***
## seasonal6 175.01 38.51 4.545 5.53e-06 ***
## seasonal7 1183.42 38.51 30.733 < 2e-16 ***
## seasonal8 2598.20 38.54 67.421 < 2e-16 ***
## seasonal9 1358.63 38.49 35.297 < 2e-16 ***
## seasonal10 774.31 38.52 20.101 < 2e-16 ***
## seasonal11 862.79 38.48 22.424 < 2e-16 ***
## seasonal12 1151.20 38.46 29.931 < 2e-16 ***
## seasonal13 1229.11 38.48 31.945 < 2e-16 ***
## seasonal14 1197.62 38.48 31.126 < 2e-16 ***
## seasonal15 1296.46 38.46 33.708 < 2e-16 ***
## seasonal16 1614.87 38.45 42.003 < 2e-16 ***
## seasonal17 2593.50 38.48 67.406 < 2e-16 ***
## seasonal18 2401.95 38.48 62.427 < 2e-16 ***
## seasonal19 1407.11 38.49 36.557 < 2e-16 ***
## seasonal20 795.80 38.49 20.675 < 2e-16 ***
## seasonal21 463.30 38.51 12.032 < 2e-16 ***
## seasonal22 306.15 38.52 7.948 2.04e-15 ***
## seasonal23 151.19 38.55 3.922 8.83e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 685.9 on 15213 degrees of freedom
## Multiple R-squared: 0.6104, Adjusted R-squared: 0.6098
## F-statistic: 1036 on 23 and 15213 DF, p-value: < 2.2e-16
length(md_seasonality$residuals)
## [1] 15237
md_trend.2 <- lm(cnt ~ trend + seasonal + t1 + t2+ hum + wind_speed + factor(weather_code) + factor(season)+ I(is_weekend) +I(is_holiday), data = train)
summary(md_trend.2)
##
## Call:
## lm(formula = cnt ~ trend + seasonal + t1 + t2 + hum + wind_speed +
## factor(weather_code) + factor(season) + I(is_weekend) + I(is_holiday),
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2957.2 -283.9 -24.9 268.5 4258.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.958e+02 5.437e+01 18.316 < 2e-16 ***
## trend 2.066e-03 1.155e-03 1.789 0.073578 .
## seasonal1 -6.979e+01 3.243e+01 -2.152 0.031418 *
## seasonal2 -1.141e+02 3.249e+01 -3.512 0.000446 ***
## seasonal3 -1.407e+02 3.250e+01 -4.328 1.51e-05 ***
## seasonal4 -1.437e+02 3.252e+01 -4.417 1.01e-05 ***
## seasonal5 -9.843e+01 3.255e+01 -3.024 0.002502 **
## seasonal6 2.414e+02 3.254e+01 7.418 1.26e-13 ***
## seasonal7 1.207e+03 3.253e+01 37.093 < 2e-16 ***
## seasonal8 2.559e+03 3.258e+01 78.547 < 2e-16 ***
## seasonal9 1.251e+03 3.271e+01 38.245 < 2e-16 ***
## seasonal10 5.895e+02 3.302e+01 17.852 < 2e-16 ***
## seasonal11 6.157e+02 3.333e+01 18.472 < 2e-16 ***
## seasonal12 8.597e+02 3.358e+01 25.598 < 2e-16 ***
## seasonal13 9.083e+02 3.378e+01 26.890 < 2e-16 ***
## seasonal14 8.587e+02 3.385e+01 25.365 < 2e-16 ***
## seasonal15 9.414e+02 3.381e+01 27.843 < 2e-16 ***
## seasonal16 1.276e+03 3.367e+01 37.887 < 2e-16 ***
## seasonal17 2.287e+03 3.349e+01 68.286 < 2e-16 ***
## seasonal18 2.128e+03 3.316e+01 64.183 < 2e-16 ***
## seasonal19 1.189e+03 3.289e+01 36.164 < 2e-16 ***
## seasonal20 6.240e+02 3.267e+01 19.096 < 2e-16 ***
## seasonal21 3.501e+02 3.252e+01 10.766 < 2e-16 ***
## seasonal22 2.388e+02 3.246e+01 7.357 1.98e-13 ***
## seasonal23 1.174e+02 3.246e+01 3.618 0.000298 ***
## t1 7.531e+01 6.457e+00 11.664 < 2e-16 ***
## t2 -3.200e+01 5.200e+00 -6.154 7.73e-10 ***
## hum -1.308e+01 5.222e-01 -25.055 < 2e-16 ***
## wind_speed -9.147e+00 6.939e-01 -13.182 < 2e-16 ***
## factor(weather_code)2 3.929e+01 1.330e+01 2.954 0.003146 **
## factor(weather_code)3 5.674e-02 1.476e+01 0.004 0.996933
## factor(weather_code)4 -4.966e+01 2.002e+01 -2.481 0.013123 *
## factor(weather_code)7 -2.433e+02 1.801e+01 -13.509 < 2e-16 ***
## factor(weather_code)10 -6.386e+02 1.551e+02 -4.118 3.84e-05 ***
## factor(weather_code)26 1.368e+02 8.209e+01 1.666 0.095657 .
## factor(season)1 1.310e+01 1.672e+01 0.784 0.433211
## factor(season)2 2.741e+01 1.561e+01 1.756 0.079133 .
## factor(season)3 -6.822e+01 1.409e+01 -4.841 1.30e-06 ***
## I(is_weekend) -2.099e+02 1.045e+01 -20.087 < 2e-16 ***
## I(is_holiday) -2.768e+02 3.353e+01 -8.256 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 577.3 on 15197 degrees of freedom
## Multiple R-squared: 0.7242, Adjusted R-squared: 0.7235
## F-statistic: 1023 on 39 and 15197 DF, p-value: < 2.2e-16
train$yhat <- predict(md_trend.2, newdata = train)
test$yhat <- predict(md_trend.2, newdata = test)
#MyData
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_lm <- function(data, train, test, title = NULL){
p <- plot_ly(data = data,
x = ~ time,
y = ~ cnt,
type = "scatter",
mode = "line",
name = "Actual") %>%
add_lines(x = ~ train$time,
y = ~ train$yhat,
line = list(color = "red"),
name = "Fitted") %>%
add_lines(x = ~ test$time,
y = ~ test$yhat,
line = list(color = "green", dash = "dot", width = 3),
name = "Forecasted") %>%
layout(title = title,
xaxis = list(title = "Year"),
yaxis = list(title = "Number of Rides booked"),
legend = list(x = 0.05, y = 0.95))
return(p) }
plot_lm(data = MyData,
train = train,
test = test,
title = "Predicting the Trend (Polynomial) and Seasonal Components of the Series")
#Accuracy of Model: 1.30% Prediction Error on the testing set
mape_md2 <- c(mean(abs(train$cnt - train$yhat) / train$cnt),
mean(abs(test$cnt - test$yhat) / test$cnt))
mape_md2
## [1] Inf 1.202572
#ANOVA
anova(md_trend.2)
## Analysis of Variance Table
##
## Response: cnt
## Df Sum Sq Mean Sq F value Pr(>F)
## trend 1 1.4210e+08 142102653 426.354 < 2.2e-16 ***
## seasonal 23 1.1210e+10 487405330 1462.375 < 2.2e-16 ***
## t1 1 1.0756e+09 1075581828 3227.097 < 2.2e-16 ***
## t2 1 2.2678e+07 22678056 68.042 < 2.2e-16 ***
## hum 1 4.8328e+08 483284498 1450.012 < 2.2e-16 ***
## wind_speed 1 1.0116e+08 101157521 303.506 < 2.2e-16 ***
## factor(weather_code) 6 1.0649e+08 17748200 53.250 < 2.2e-16 ***
## factor(season) 3 1.1904e+07 3967975 11.905 8.772e-08 ***
## I(is_weekend) 1 1.2546e+08 125458600 376.417 < 2.2e-16 ***
## I(is_holiday) 1 2.2719e+07 22719234 68.165 < 2.2e-16 ***
## Residuals 15197 5.0651e+09 333297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#REsiduals Analysis
checkresiduals(md_trend.2)
##
## Breusch-Godfrey test for serial correlation of order up to 43
##
## data: Residuals
## LM test = 10716, df = 43, p-value < 2.2e-16
We can see that from the residual lag dependance that our model did not capture all of the variation in the data #we try ofseeting this effect by estimating residuals with ARIMA errors
#goal estimate residuals with ARMA process and add it to the model #make auto arima on residulas
MK I
library('forecast')
MyData.ts<-ts(MyData)
#MyData.ts
cnt_md<-auto.arima(MyData.ts[,2])
#cnt_md$residuals
MyDATA.prot.2 <- data.frame(MyData)
MyDATA.prot.2$residuals <- cnt_md$residuals
#head(MyDATA.prot.2)
#uses ARIMA(2,1,1) to model noise
h <- length(MyDATA.prot.2$time)/8 # setting a testing partition length
train <- MyDATA.prot.2[1:(nrow(MyDATA.prot.2) - h), ]
test <- MyDATA.prot.2[(nrow(MyDATA.prot.2) - h + 1):nrow(MyDATA.prot.2), ]
md_pred.res<-lm(cnt ~ trend + seasonal + t1 + hum + wind_speed + factor(weather_code) + factor(season)+ I(is_weekend) +I(is_holiday)+ residuals , data = train)
train$yhat <- predict(md_pred.res, newdata = train) #+ train$residuals
test$yhat <- predict(md_pred.res, newdata = test) #+ test$residuals
plot_lm(data =MyDATA.prot.2,
train = train,
test = test,
title = "Predicting the number of rides booked ")
summary(md_pred.res)
##
## Call:
## lm(formula = cnt ~ trend + seasonal + t1 + hum + wind_speed +
## factor(weather_code) + factor(season) + I(is_weekend) + I(is_holiday) +
## residuals, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2399.3 -260.9 -19.2 258.4 4032.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.227e+03 4.681e+01 26.217 < 2e-16 ***
## trend 2.384e-03 1.073e-03 2.222 0.026275 *
## seasonal1 -1.107e+02 3.016e+01 -3.670 0.000243 ***
## seasonal2 -1.688e+02 3.022e+01 -5.585 2.38e-08 ***
## seasonal3 -2.123e+02 3.025e+01 -7.019 2.33e-12 ***
## seasonal4 -2.317e+02 3.028e+01 -7.652 2.09e-14 ***
## seasonal5 -2.214e+02 3.036e+01 -7.294 3.17e-13 ***
## seasonal6 -4.590e+01 3.081e+01 -1.490 0.136229
## seasonal7 6.145e+02 3.254e+01 18.883 < 2e-16 ***
## seasonal8 1.800e+03 3.398e+01 52.984 < 2e-16 ***
## seasonal9 1.852e+03 3.273e+01 56.571 < 2e-16 ***
## seasonal10 2.966e+02 3.127e+01 9.484 < 2e-16 ***
## seasonal11 4.962e+02 3.108e+01 15.966 < 2e-16 ***
## seasonal12 7.554e+02 3.129e+01 24.143 < 2e-16 ***
## seasonal13 8.361e+02 3.143e+01 26.601 < 2e-16 ***
## seasonal14 7.435e+02 3.156e+01 23.563 < 2e-16 ***
## seasonal15 7.560e+02 3.165e+01 23.882 < 2e-16 ***
## seasonal16 1.018e+03 3.173e+01 32.089 < 2e-16 ***
## seasonal17 1.694e+03 3.338e+01 50.738 < 2e-16 ***
## seasonal18 2.230e+03 3.089e+01 72.188 < 2e-16 ***
## seasonal19 1.401e+03 3.086e+01 45.382 < 2e-16 ***
## seasonal20 6.604e+02 3.038e+01 21.739 < 2e-16 ***
## seasonal21 4.849e+02 3.035e+01 15.975 < 2e-16 ***
## seasonal22 3.134e+02 3.021e+01 10.374 < 2e-16 ***
## seasonal23 1.681e+02 3.019e+01 5.567 2.63e-08 ***
## t1 3.586e+01 1.368e+00 26.203 < 2e-16 ***
## hum -1.352e+01 4.783e-01 -28.276 < 2e-16 ***
## wind_speed -7.733e+00 6.198e-01 -12.477 < 2e-16 ***
## factor(weather_code)2 2.166e+01 1.237e+01 1.751 0.079912 .
## factor(weather_code)3 -1.141e+01 1.372e+01 -0.832 0.405601
## factor(weather_code)4 -5.517e+01 1.861e+01 -2.965 0.003028 **
## factor(weather_code)7 -2.132e+02 1.674e+01 -12.735 < 2e-16 ***
## factor(weather_code)10 -5.255e+02 1.441e+02 -3.646 0.000267 ***
## factor(weather_code)26 1.696e+02 7.627e+01 2.223 0.026207 *
## factor(season)1 3.296e+01 1.542e+01 2.137 0.032596 *
## factor(season)2 3.069e+01 1.451e+01 2.115 0.034433 *
## factor(season)3 -6.288e+01 1.303e+01 -4.824 1.42e-06 ***
## I(is_weekend) -2.060e+02 9.711e+00 -21.215 < 2e-16 ***
## I(is_holiday) -2.782e+02 3.116e+01 -8.928 < 2e-16 ***
## residuals 5.362e-01 1.087e-02 49.346 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 536.6 on 15197 degrees of freedom
## Multiple R-squared: 0.7617, Adjusted R-squared: 0.7611
## F-statistic: 1246 on 39 and 15197 DF, p-value: < 2.2e-16
#plot(md_pred.res)
checkresiduals(md_pred.res)
##
## Breusch-Godfrey test for serial correlation of order up to 43
##
## data: Residuals
## LM test = 10843, df = 43, p-value < 2.2e-16
mape_md_pred.res <- c(mean(abs(train$cnt - train$yhat) / train$cnt),
mean(abs(test$cnt - test$yhat) / test$cnt))
mape_md_pred.res
## [1] Inf 1.113871
#residuals are modeleled by an ARIMA((2,1,1)) linear process
#not actual residulas